## Initial type cleanup: parse the origination month, engineer the Converted
## label, and coerce segment/credit-history columns to factors.
## Fix: BKHist, CollHist, CollHist2Yr and PublicHist2Yr were each converted
## twice in the original call; the duplicates were no-ops and are removed.
tcu_cleaned <- tcu %>%
  #select(-OriginationPurpose, -OrigBehavioralSegment, -OrigEngagementSegment) %>%
  mutate(OriginationMonth = as.Date(OriginationMonth),
         ChangeEngagementSegment = as.factor(ChangeEngagementSegment),
         ChangeBehavioralSegment = as.factor(ChangeBehavioralSegment),
         # Label: a customer "converted" iff an engagement-change month exists.
         Converted = ifelse(EngagementChangeMonth == '',0,1), #engineered our label (dependent variable)
         Converted = as.factor(Converted),
         BKHist = as.factor(BKHist),
         CollHist = as.factor(CollHist),
         CollHist2Yr = as.factor(CollHist2Yr),
         PublicHist2Yr = as.factor(PublicHist2Yr),
         Delq30Hist2Yr = as.factor(Delq30Hist2Yr),
         Delq60Hist2Yr = as.factor(Delq60Hist2Yr),
         Delq90Hist2Yr = as.factor(Delq90Hist2Yr),
         OrigHouseholdWasRetail = as.factor(as.character(OrigHouseholdWasRetail)))
## Parse the contact/engagement timestamp columns (stored as "YYYY/MM/DD"
## text) into Date; strings that do not match the format become NA.
tcu_cleaned <- tcu_cleaned %>%
  mutate_at(vars(EngagementChangeMonth, WelcomeLetter, PreapprovalLetter,
                 PreapprovalMessage, PreapprovalConvo, WelcomeMessage,
                 WelcomeConvo),
            as.Date, format = '%Y/%m/%d')
Count the missing values in each column to plan the imputation steps below.
## Check NAs: count of missing values per column.
## vapply is type-stable; sapply can silently change its return type.
vapply(tcu_cleaned, function(x) sum(is.na(x)), integer(1))
OriginationMonth BaseXPMembershipID
0 0
OriginationPurpose OrigBehavioralSegment
0 0
OrigEngagementSegment OrigZip
0 2
NbrIndirectLoans OrigBalanceIndirectLoans
0 0
OrigHousehold AgeAtOrigination
0 4
DecisionCreditScore MonthlyIncome
0 0
BorrowerCreditScore BankruptcyTotal
94082 17505
CollectionTotal OtherPublicTotal
17505 17505
BKHist CollHist
17505 17505
CollHist2Yr PublicHist2Yr
17505 17505
Delq30Hist2Yr Delq60Hist2Yr
17505 17505
Delq90Hist2Yr TrdDeptStore
17505 17505
TrdBankCard TrdBank
17505 17505
TrdAuto TrdOthFin
17505 17505
TrdOil TrdMortgage
17505 17505
TrdOther Employer1Name
17505 22
HousePaymentRentalAmount OtherExpense
9324 9324
InitialLTV WelcomeLetter
9402 96228
PreapprovalLetter PreapprovalMessage
138931 183129
PreapprovalConvo WelcomeMessage
182852 183245
WelcomeConvo EngagementChangeMonth
182713 179354
ChangeEngagementSegment ChangeBehavioralSegment
0 0
OrigHouseholdWasRetail VehiclesUsed
0 4
VehiclesNew WghtAvgContractRate
4 4
AvgTerm AvgDaysToFirstPayment
4 4
SumScheduledPmts SumLoanPaymentTotals
4 10
DisabilityInsPolices ExtendedWarranties
4 4
GapWaivers JointLifeInsPolicies
4 4
LifeInsPolicies Rebook
4 4
VehicleMake VehicleModel
0 0
VehicleYear Converted
58 0
## Remove Age Outliers: keep adults only.
## Fix: dplyr::filter() already drops rows where the condition evaluates to
## NA, so the separate !is.na() step was redundant; one filter does both.
tcu_cleaned <- tcu_cleaned %>%
  filter(!is.na(AgeAtOrigination), AgeAtOrigination >= 18)
## Decision scores outside the valid US credit-score range [300, 850]
## (~2000+ customers). Where a borrower score exists, substitute it for the
## out-of-range decision score.
credit_score_outliers <- tcu_cleaned %>%
  filter(DecisionCreditScore < 300 | DecisionCreditScore > 850) %>%
  mutate(DecisionCreditScore = ifelse(is.na(BorrowerCreditScore),
                                      DecisionCreditScore,
                                      BorrowerCreditScore))
## Mean decision score among converted customers with in-range scores;
## used below to impute implausibly low scores.
mean_credit_score <- tcu_cleaned %>%
  filter(Converted == "1",
         DecisionCreditScore >= 300,
         DecisionCreditScore <= 850) %>%
  summarize(mean_score = round(mean(DecisionCreditScore)))
unlist(mean_credit_score)
mean_score
729
## Impute out-of-range scores for CONVERTED outlier customers only.
## NOTE(review): the !is.na(EngagementChangeMonth) filter silently discards
## the non-converted outlier rows from credit_score_outliers, so they are
## lost for good when this frame is rbind-ed back below — confirm intended.
credit_score_outliers <- credit_score_outliers %>% #impute credit score for converted customers
filter(!is.na(EngagementChangeMonth)) %>% ## scores below 300 are replaced with the mean score of converted customers
mutate(DecisionCreditScore = ifelse(DecisionCreditScore < 300, unlist(mean_credit_score), DecisionCreditScore),
DecisionCreditScore = ifelse(DecisionCreditScore > 850, 850, DecisionCreditScore)) ## concern: many "engaged" customers sit at exactly 300, which is hard to believe for genuinely engaged low-credit customers
## Keep only in-range scores, then append the imputed outlier rows back.
## Fix: `=` top-level assignment replaced with `<-` for consistency with
## the rest of the script.
tcu_cleaned <- tcu_cleaned %>%
  filter(DecisionCreditScore >= 300 & DecisionCreditScore <= 850)
## Use rbind to merge the two dfs (identical column sets, so rbind is safe)
tcu_cleaned_new <- rbind(tcu_cleaned, credit_score_outliers)
summary(tcu_cleaned_new$DecisionCreditScore)
Min. 1st Qu. Median Mean 3rd Qu. Max.
380.0 705.0 752.0 751.5 806.0 850.0
## Income cleaning:
##   1. keep households reporting >= $2,000/month;
##   2. drop members whose income falls in the $20k-$50k band (presumably
##      ambiguous between monthly and annual — confirm with data owner);
##   3. treat values >= $50k as annual and convert to monthly;
##   4. keep cleaned incomes at or below $20k/month.
tcu_cleaned_income1 <- tcu_cleaned_new %>%
  filter(MonthlyIncome >= 2000)
#length(unique(tcu_cleaned_income1$OrigHousehold))
anti_join_tcu <- tcu_cleaned_income1 %>%
  filter(MonthlyIncome >= 20000, MonthlyIncome < 50000)
tcu_cleaned_income_2 <- tcu_cleaned_income1 %>%
  anti_join(anti_join_tcu, by = "BaseXPMembershipID")
tcu_cleaned_income_3 <- tcu_cleaned_income_2 %>%
  mutate(MonthlyIncome = ifelse(MonthlyIncome >= 50000,
                                MonthlyIncome / 12,
                                MonthlyIncome)) %>%
  filter(MonthlyIncome <= 20000)
## Impute missing credit-bureau fields (NA = no bureau record):
## counts -> 0, history flags -> FALSE.
## BUG FIX: BKHist..Delq90Hist2Yr were converted to factors earlier, and
## ifelse(is.na(x), FALSE, x) on a factor returns the underlying integer
## codes, silently corrupting those columns into a 0/1/2 numeric mix (the
## later chisq df = 2 outputs reflect this). Round-trip through character
## so the imputed columns stay factors with their original labels + "FALSE".
tcu_cleaned_4 <- tcu_cleaned_income_3 %>%
  mutate_at(vars(BKHist, CollHist, CollHist2Yr, PublicHist2Yr,
                 Delq30Hist2Yr, Delq60Hist2Yr, Delq90Hist2Yr),
            ~ as.factor(ifelse(is.na(as.character(.)), "FALSE", as.character(.)))) %>%
  mutate(BankruptcyTotal = ifelse(is.na(BankruptcyTotal), 0, BankruptcyTotal),
         CollectionTotal = ifelse(is.na(CollectionTotal), 0, CollectionTotal),
         OtherPublicTotal = ifelse(is.na(OtherPublicTotal), 0, OtherPublicTotal),
         # TrdDeptStore was never factored, so a plain ifelse is safe here.
         TrdDeptStore = ifelse(is.na(TrdDeptStore), FALSE, TrdDeptStore),
         TrdBankCard = ifelse(is.na(TrdBankCard), 0, TrdBankCard),
         TrdBank = ifelse(is.na(TrdBank), 0, TrdBank),
         TrdAuto = ifelse(is.na(TrdAuto), 0, TrdAuto),
         TrdOthFin = ifelse(is.na(TrdOthFin), 0, TrdOthFin),
         TrdOil = ifelse(is.na(TrdOil), 0, TrdOil),
         TrdMortgage = ifelse(is.na(TrdMortgage), 0, TrdMortgage),
         TrdOther = ifelse(is.na(TrdOther), 0, TrdOther)) %>%
  filter(!is.na(HousePaymentRentalAmount)) %>%
  # Combine housing + other expenses into a single feature.
  mutate(total_expense = HousePaymentRentalAmount + OtherExpense) %>%
  dplyr::select(-HousePaymentRentalAmount, -OtherExpense) %>%
  filter(!is.na(WghtAvgContractRate)) %>%
  filter(!is.na(OrigZip))
# Normalize make names to upper case so variants match in the fuzzy join.
tcu_cleaned_4$VehicleMake <- toupper(tcu_cleaned_4$VehicleMake)
# Which vehicle makes appear on rows with a missing InitialLTV?
test5 <- tcu_cleaned_4 %>%
  filter(is.na(InitialLTV)) %>%
  dplyr::select(VehicleMake)
table(test5$VehicleMake)
AUDI BUICK CHEVROLET CHEVROLET TRUCK
1 3 8 3
CHRYSLER DODGE DODGE TRUCK FORD
1 4 4 8
FORD TRUCK FORD TRUCKS GMC GMC TRUCKS
2 1 3 1
HONDA HYUNDAI JEEP KIA
5 3 5 1
MAZDA MERCURY NISSAN PONTIAC
1 3 1 2
SATURN SUBARU TOYOTA VOLKSWAGEN
2 1 4 1
# Distinct makes among the missing-LTV rows (feeds the reference list below).
unique(test5$VehicleMake)
[1] "TOYOTA" "CHEVROLET" "HYUNDAI"
[4] "CHEVROLET TRUCK" "MERCURY" "DODGE"
[7] "FORD" "GMC TRUCKS" "GMC"
[10] "DODGE TRUCK" "FORD TRUCK" "JEEP"
[13] "HONDA" "BUICK" "SUBARU"
[16] "VOLKSWAGEN" "PONTIAC" "SATURN"
[19] "MAZDA" "AUDI" "KIA"
[22] "CHRYSLER" "NISSAN" "FORD TRUCKS"
## Reference list of canonical make names; truck variants such as
## "FORD TRUCK" get matched to these via fuzzy string matching below.
canonical_makes <- c("AUDI", "BUICK", "CHEVROLET", "CHRYSLER", "DODGE", "FORD",
                     "GMC", "HONDA", "HYUNDAI", "JEEP", "KIA", "MAZDA",
                     "MERCURY", "NISSAN", "PONTIAC", "SATURN", "SUBARU",
                     "TOYOTA", "VOLKSWAGEN")
car <- data.frame(VehicleMake = canonical_makes)
## Fuzzy match: pair each observed make string with the closest canonical
## make using Jaro-Winkler distance (max_dist 0.2).
library(stringdist)
library(fuzzyjoin)
# NOTE(review): a fuzzy left join can duplicate rows when one make matches
# several reference names within max_dist — verify row counts afterwards.
fuzzy_join_vehicle <- stringdist_left_join(tcu_cleaned_4, car, by = "VehicleMake",
method = "jw", distance_col = "distance", max_dist = .2)
## Mean observed InitialLTV per matched canonical make (NA values excluded);
## used below to impute the rows where InitialLTV is missing.
missing_ltv_result <- fuzzy_join_vehicle %>%
  filter(!is.na(VehicleMake.y)) %>%
  dplyr::select(VehicleMake.x, VehicleMake.y, distance, InitialLTV) %>%
  group_by(VehicleMake.y) %>%
  summarize(mean_InitialLTV = mean(InitialLTV, na.rm = TRUE)) %>%
  rename(VehicleMake = VehicleMake.y)
## Split rows by whether InitialLTV is present; the NA rows get imputed
## below and are then appended back onto tcu_cleaned_5.
tcu_na_initial_ltv <- filter(tcu_cleaned_4, is.na(InitialLTV))
tcu_cleaned_5 <- filter(tcu_cleaned_4, !is.na(InitialLTV))
## Join the mean LTV to `tcu_na_initial_ltv`
final_impute_ltv <- stringdist_left_join(tcu_na_initial_ltv, missing_ltv_result, by = "VehicleMake",
method = "jw", distance_col = "distance", max_dist = .20)
## NOTE(review): the two constants below are hard-coded mean LTVs for truck
## variants the fuzzy match handles poorly ("FORD TRUCK(S)", "GMC TRUCKS");
## presumably copied from missing_ltv_result — confirm, and derive them
## programmatically if this pipeline is re-run on new data.
final_impute_ltv_df <- final_impute_ltv %>%
#select(VehicleMake.x, VehicleMake.y, InitialLTV,mean_InitialLTV) %>%
mutate(mean_InitialLTV = ifelse(VehicleMake.x == "FORD TRUCKS" | VehicleMake.x == "FORD TRUCK", 194.72949, mean_InitialLTV)) %>%
mutate(mean_InitialLTV = ifelse(VehicleMake.x == "GMC TRUCKS", 82.97587,mean_InitialLTV)) %>%
## replace the NA values with mean_InitialLTV
mutate(InitialLTV = ifelse(is.na(InitialLTV), mean_InitialLTV, InitialLTV)) %>%
rename(VehicleMake = VehicleMake.x) %>%
dplyr::select(-VehicleMake.y, -mean_InitialLTV, -distance)
## Recombine the imputed rows with the complete rows.
## NOTE(review): rbind requires identical column sets/order — this holds
## here because final_impute_ltv_df drops the join bookkeeping columns.
tcu_cleaned_6 <- rbind(tcu_cleaned_5, final_impute_ltv_df)
summary(tcu_cleaned_6$InitialLTV)
Min. 1st Qu. Median Mean 3rd Qu. Max.
0.0 74.8 90.2 176.4 103.0 3080200.0
## Collapse each contact-date column into a binary "was contacted" factor
## (1 = a date exists, 0 = NA), drop identifier columns, then swap the
## flags in under the original column names (column order preserved).
tcu_cleaned_7 <- tcu_cleaned_6 %>%
  mutate(WL = ifelse(is.na(WelcomeLetter), 0, 1),
         PL = ifelse(is.na(PreapprovalLetter), 0, 1),
         PM = ifelse(is.na(PreapprovalMessage), 0, 1),
         PC = ifelse(is.na(PreapprovalConvo), 0, 1),
         WM = ifelse(is.na(WelcomeMessage), 0, 1),
         WC = ifelse(is.na(WelcomeConvo), 0, 1)) %>%
  mutate_at(c("WL", "PL", "PM", "PC", "WM", "WC"), as.factor)
tcu_cleaned_7 <- tcu_cleaned_7 %>%
  select(-BaseXPMembershipID, -OriginationPurpose, -OrigBehavioralSegment,
         -OrigEngagementSegment, -OrigHousehold) %>%
  mutate(OrigZip = as.character(OrigZip))
tcu_cleaned_7 <- tcu_cleaned_7 %>%
  dplyr::select(-WelcomeLetter, -PreapprovalLetter, -PreapprovalMessage,
                -PreapprovalConvo, -WelcomeMessage, -WelcomeConvo) %>%
  dplyr::rename(WelcomeLetter = WL, PreapprovalLetter = PL,
                PreapprovalMessage = PM, PreapprovalConvo = PC,
                WelcomeMessage = WM, WelcomeConvo = WC)
library(patchwork)
## DecisionCreditScore by conversion status: density + boxplot side by side.
## Fix: theme_bw() is a complete theme and replaces everything set by
## theme_minimal(), so the earlier theme_minimal() calls were dead code.
decision_histogram <- ggplot(tcu_cleaned_7, aes(x = DecisionCreditScore, fill = Converted)) +
  geom_density(alpha = 0.5) +
  labs(title = "DecisionCreditScore") +
  theme_bw() +
  theme(legend.position = "none")
decision_boxplot <- ggplot(tcu_cleaned_7, aes(x = Converted, y = DecisionCreditScore, fill = Converted)) +
  geom_boxplot() +
  theme_bw()
decision_histogram | decision_boxplot
#decision_histogram
## AgeAtOrigination by conversion status.
## Fix: redundant theme_minimal() before the complete theme theme_bw() removed.
age_histogram <- ggplot(tcu_cleaned_7, aes(x = AgeAtOrigination, fill = Converted)) +
  geom_density(alpha = 0.5) +
  labs(title = "AgeAtOrigination") +
  theme_bw() +
  theme(legend.position = "none")
age_boxplot <- ggplot(tcu_cleaned_7, aes(x = Converted, y = AgeAtOrigination, fill = Converted)) +
  geom_boxplot() +
  theme_bw()
age_histogram | age_boxplot
## MonthlyIncome by conversion status.
## Fix: redundant theme_minimal() before the complete theme theme_bw() removed.
monthly_income_histogram <- tcu_cleaned_7 %>%
  #filter(MonthlyIncome < 60000) %>%
  ggplot(aes(x = MonthlyIncome, fill = Converted)) +
  geom_density(alpha = 0.3) +
  labs(title = "MonthlyIncome") +
  theme_bw() +
  theme(legend.position = "none")
monthly_income_boxplot <- tcu_cleaned_7 %>%
  #filter(MonthlyIncome < 20000) %>%
  ggplot(aes(x = Converted, y = MonthlyIncome, fill = Converted)) +
  geom_boxplot() +
  theme_bw()
monthly_income_histogram | monthly_income_boxplot
## SumLoanPaymentTotals by conversion status.
## Fix: redundant theme_minimal() before the complete theme theme_bw() removed.
SumLoanPaymentTotals_histogram <- ggplot(tcu_cleaned_7, aes(x = SumLoanPaymentTotals, fill = Converted)) +
  geom_density(alpha = 0.5) +
  ggtitle("SumLoanPaymentsTotal") +
  theme_bw() +
  theme(legend.position = "none")
SumLoanPaymentTotals_boxplot <- ggplot(tcu_cleaned_7, aes(x = Converted, y = SumLoanPaymentTotals, fill = Converted)) +
  geom_boxplot() +
  theme_bw()
SumLoanPaymentTotals_histogram | SumLoanPaymentTotals_boxplot
## Total expense (housing + other) by conversion status, zoomed below $6k.
## Fix: redundant theme_minimal() before the complete theme theme_bw() removed.
totalexpense_histogram <- tcu_cleaned_7 %>%
  filter(total_expense < 6000) %>%
  ggplot(aes(x = total_expense, fill = Converted)) +
  geom_density(alpha = 0.5) +
  labs(title = "TotalExpense") +
  theme_bw() +
  theme(legend.position = "none")
totalexpense_boxplot <- tcu_cleaned_7 %>%
  filter(total_expense < 6000) %>%
  ggplot(aes(x = Converted, y = total_expense, fill = Converted)) +
  geom_boxplot() +
  theme_bw()
totalexpense_histogram | totalexpense_boxplot
## TrdBankCard by conversion status.
TrdBankCard_histogram <- tcu_cleaned_7 %>%
  ggplot(aes(x = TrdBankCard, fill = Converted)) +
  geom_histogram() +
  theme_minimal()
TrdBankCard_boxplot <- tcu_cleaned_7 %>%
  ggplot(aes(x = Converted, y = TrdBankCard, fill = Converted)) +
  geom_boxplot() +
  theme_minimal()
TrdBankCard_histogram | TrdBankCard_boxplot
## AvgDaysToFirstPayment by conversion status, zoomed to <= 150 days.
AvgDaysToFirstPayment_histogram <- ggplot(filter(tcu_cleaned_7, AvgDaysToFirstPayment <= 150),
                                          aes(x = AvgDaysToFirstPayment, fill = Converted)) +
  geom_density(alpha = 0.5) +
  labs(title = "AvgDaysToFirstPayment") +
  theme_minimal() +
  theme(legend.position = "none")
AvgDaysToFirstPayment_boxplot <- ggplot(filter(tcu_cleaned_7, AvgDaysToFirstPayment <= 150),
                                        aes(x = Converted, y = AvgDaysToFirstPayment, fill = Converted)) +
  geom_boxplot() +
  theme_minimal()
AvgDaysToFirstPayment_histogram | AvgDaysToFirstPayment_boxplot
## AvgTerm by conversion status.
AvgTerm_histogram <- ggplot(tcu_cleaned_7, aes(x = AvgTerm, fill = Converted)) +
  geom_density(alpha = 0.5) +
  labs(title = "AvgTerm") +
  theme_minimal() +
  theme(legend.position = "none")
AvgTerm_boxplot <- ggplot(tcu_cleaned_7, aes(x = Converted, y = AvgTerm, fill = Converted)) +
  geom_boxplot() +
  theme_minimal()
AvgTerm_histogram | AvgTerm_boxplot
## WghtAvgContractRate by conversion status.
WghtAvgContractRate_histogram <- ggplot(tcu_cleaned_7, aes(x = WghtAvgContractRate, fill = Converted)) +
  geom_density(alpha = 0.5) +
  labs(title = "WghtAvgContractRate") +
  theme_minimal() +
  theme(legend.position = "none")
WghtAvgContractRate_boxplot <- ggplot(tcu_cleaned_7, aes(x = Converted, y = WghtAvgContractRate, fill = Converted)) +
  geom_boxplot() +
  theme_minimal()
WghtAvgContractRate_histogram | WghtAvgContractRate_boxplot
## InitialLTV by conversion status, zoomed to LTV <= 200 (the raw column
## contains extreme outliers — see the summary printed earlier).
InitialLTV_histogram <- ggplot(filter(tcu_cleaned_7, InitialLTV <= 200),
                               aes(x = InitialLTV, fill = Converted)) +
  geom_density(alpha = 0.5) +
  labs(title = "InitialLTV") +
  theme_minimal() +
  theme(legend.position = "none")
InitialLTV_boxplot <- ggplot(filter(tcu_cleaned_7, InitialLTV <= 200),
                             aes(x = Converted, y = InitialLTV, fill = Converted)) +
  geom_boxplot() +
  theme_minimal()
InitialLTV_histogram | InitialLTV_boxplot
library(ggcorrplot)
## Index vector of the numeric columns.
## vapply is type-stable; sapply could degrade to a list on edge cases.
numericVars <- which(vapply(tcu_cleaned_7, is.numeric, logical(1)))
numericVarNames <- names(numericVars) #saving names for use later on
cat("There are", length(numericVarNames), "numeric variables")
There are 40 numeric variables
## Index vector of the factor (categorical) columns; vapply for type safety.
categoricalVars <- which(vapply(tcu_cleaned_7, is.factor, logical(1)))
categoricalNames <- names(categoricalVars) #saving names for use later on
length(categoricalNames)
[1] 10
#categoricalNames
## Pairwise-complete correlation matrix over the numeric columns.
tcu_numericVars <- tcu_cleaned_7[, numericVars]
corr <- cor(tcu_numericVars, use = 'pairwise.complete.obs')
#corr[,'SumLoanPaymentTotals']
#corr[,'WghtAvgContractRate']
corr_df <- as.data.frame(corr)
# Rows containing at least one correlation >= 0.5 (kept for inspection;
# not referenced again below).
new <- corr_df %>%
  filter_all(any_vars(. >= 0.5))
ggcorrplot(corr, lab = FALSE)
Variable distribution overview: numeric histograms first, then categorical bar charts.
## Histograms of a positional slice of columns (23:56), numeric only.
## NOTE(review): selecting columns by position is fragile — any upstream
## column add/drop silently changes which variables get plotted; prefer
## selecting by name.
tcu_cleaned_7 %>%
dplyr::select(names(.)[23:56]) %>%
purrr::keep(is.numeric) %>%
gather() %>%
ggplot() +
geom_histogram(mapping = aes(x=value,fill=key), color="black") +
facet_wrap(~ key, scales = "free") +
theme_minimal() +
theme(legend.position = "none")
## Level counts for every factor column, as faceted horizontal bar charts.
## NOTE(review): gather() coerces the factor values when stacking columns
## with different levels — fine for plotting, but the key/value frame is
## no longer typed.
tcu_cleaned_7 %>%
purrr::keep(is.factor) %>%
gather() %>%
group_by(key,value) %>%
summarise(n = n()) %>%
ggplot() +
geom_bar(mapping=aes(x = value, y = n, fill=key), color="black", stat='identity') +
coord_flip() +
facet_wrap(~ key, scales = "free") +
theme_minimal() +
theme(legend.position = "none") +
labs(x = "")
## Build the modeling frame: drop the columns the label was engineered from
## (EngagementChangeMonth and the Change* segments — keeping them would
## leak the target), identifiers/free text, and excluded features.
tcu_cleaned_8 <- tcu_cleaned_7 %>%
  dplyr::select(-OriginationMonth, -BorrowerCreditScore, -Employer1Name,
                -EngagementChangeMonth, -ChangeEngagementSegment,
                -ChangeBehavioralSegment, -VehicleMake, -VehicleModel,
                -VehicleYear, -OrigZip, -SumScheduledPmts,
                -OrigBalanceIndirectLoans, -WghtAvgContractRate)
## Class balance of the label before resampling.
tcu_cleaned_8 %>%
  count(Converted) %>%
  mutate(pct = prop.table(n)) %>%
  ggplot(aes(x = Converted, y = pct, fill = Converted,
             label = scales::percent(pct))) +
  geom_col() +
  geom_text(vjust = -0.5) +
  scale_y_continuous(labels = scales::percent) +
  theme_bw()
Welcome letter
# Chi-square test of association: Converted vs WelcomeLetter.
chisq.test(tcu_cleaned_8$Converted,tcu_cleaned_8$WelcomeLetter)
Pearson's Chi-squared test with Yates' continuity correction
data: tcu_cleaned_8$Converted and tcu_cleaned_8$WelcomeLetter
X-squared = 963.6, df = 1, p-value < 2.2e-16
PreapprovalLetter
# Chi-square test of association: Converted vs PreapprovalLetter.
chisq.test(tcu_cleaned_8$Converted,tcu_cleaned_8$PreapprovalLetter)
Pearson's Chi-squared test with Yates' continuity correction
data: tcu_cleaned_8$Converted and tcu_cleaned_8$PreapprovalLetter
X-squared = 173.37, df = 1, p-value < 2.2e-16
BKHist
# Chi-square test of association: Converted vs BKHist.
# NOTE(review): df = 2 in the output implies BKHist has THREE categories
# after cleaning; a TRUE/FALSE history flag should have two — check the
# NA imputation applied to the factored column upstream.
chisq.test(tcu_cleaned_8$Converted,tcu_cleaned_8$BKHist)
Pearson's Chi-squared test
data: tcu_cleaned_8$Converted and tcu_cleaned_8$BKHist
X-squared = 558.76, df = 2, p-value < 2.2e-16
CollHist
# Chi-square test of association: Converted vs CollHist.
# NOTE(review): df = 2 implies three categories — same concern as BKHist.
chisq.test(tcu_cleaned_8$Converted,tcu_cleaned_8$CollHist)
Pearson's Chi-squared test
data: tcu_cleaned_8$Converted and tcu_cleaned_8$CollHist
X-squared = 799.71, df = 2, p-value < 2.2e-16
CollHist2Yr
# Chi-square test of association: Converted vs CollHist2Yr (df = 2, see above).
chisq.test(tcu_cleaned_8$Converted,tcu_cleaned_8$CollHist2Yr)
Pearson's Chi-squared test
data: tcu_cleaned_8$Converted and tcu_cleaned_8$CollHist2Yr
X-squared = 512.43, df = 2, p-value < 2.2e-16
library(ggmosaic)
## Mosaic: conversion share within each WelcomeLetter group.
ggplot(data = tcu_cleaned_8) +
  geom_mosaic(aes(x = product(WelcomeLetter), fill = Converted), na.rm = TRUE) +
  theme_bw() +
  theme(legend.position = "none") +
  labs(x = "WelcomeLetter", y = "Converted")
## Mosaic: conversion share within each BKHist group.
ggplot(data = tcu_cleaned_8) +
  geom_mosaic(aes(x = product(BKHist), fill = Converted), na.rm = TRUE) +
  theme_bw() +
  theme(legend.position = "none") +
  labs(x = "BKHist", y = "Converted")
## Mosaic: conversion share within each CollHist group.
ggplot(data = tcu_cleaned_8) +
  geom_mosaic(aes(x = product(CollHist), fill = Converted), na.rm = TRUE) +
  theme_bw() +
  theme(legend.position = "none") +
  labs(x = "CollHist", y = "Converted")
## Mosaic: conversion share within each CollHist2Yr group.
## Fix: this plot used `tcu_cleaned` (the pre-cleaning frame) while every
## sibling plot uses `tcu_cleaned_8`; use the cleaned frame for consistency.
ggplot(data = tcu_cleaned_8) +
  geom_mosaic(aes(x = product(CollHist2Yr), fill = Converted), na.rm = TRUE) +
  theme_bw() +
  theme(legend.position = "none") +
  labs(x = "CollHist2Yr", y = "Converted")
## Mosaic: conversion share within each PreapprovalLetter group.
ggplot(data = tcu_cleaned_8) +
  geom_mosaic(aes(x = product(PreapprovalLetter), fill = Converted), na.rm = TRUE) +
  theme_bw() +
  theme(legend.position = "none") +
  labs(x = "PreapprovalLetter", y = "Converted")
We use undersampling to handle the class-imbalance problem.
library(caret)
set.seed(7231)
## Undersample the majority (non-converted) class: keep all converted rows
## plus a 5% partition of the non-converted rows, then shuffle the result.
tcu_cleaned_converted <- tcu_cleaned_8[tcu_cleaned_8$Converted == 1, ]
tcu_cleaned_nonconverted <- tcu_cleaned_8[tcu_cleaned_8$Converted == 0, ]
tcu_nonconvertedsample <- caret::createDataPartition(y = tcu_cleaned_nonconverted$Converted,
                                                     p = 0.05, list = FALSE)
tcu_nonconverted <- tcu_cleaned_nonconverted[tcu_nonconvertedsample, ]
df3 <- rbind(tcu_nonconverted, tcu_cleaned_converted)
set.seed(1234)
df3 <- df3[sample(nrow(df3)), ]
round(prop.table(table(df3$Converted)), 3)
0 1
0.488 0.512
set.seed(1234)
## 70/30 train/test split stratified on the label.
## Fix: caTools::sample.split() expects the vector of class labels as its
## first argument; `pull(.)` did not reliably supply the Converted column,
## so the split was not guaranteed to preserve the class ratio.
sample.set <- df3 %>%
  pull(Converted) %>%
  caTools::sample.split(SplitRatio = .7)
tcuTrain3 <- subset(df3, sample.set == TRUE)
tcuTest3 <- subset(df3, sample.set == FALSE)
round(prop.table(table(tcuTrain3$Converted)), 3)
0 1
0.488 0.512
#df3
library(tidyverse)
library(caret)
library(DMwR)
library(rpart)
library(ROCR)
library(randomForest)
library(xgboost)
library(caTools)
library(rpart.plot)
## Baseline model: logistic regression on all predictors.
logit.mod <- glm(Converted ~ ., family = binomial(link = 'logit'), data = tcuTrain3)
logit.pred.prob <- predict(logit.mod, tcuTest3, type = 'response')
logit.pred <- as.factor(ifelse(logit.pred.prob > 0.5, "1", "0"))
## Held-out evaluation: ROC curve plus accuracy / F1 / kappa / AUC.
test <- tcuTest3$Converted
pred <- logit.pred
prob <- logit.pred.prob
roc.pred <- prediction(predictions = prob, labels = test)
roc.perf <- performance(roc.pred, measure = "tpr", x.measure = "fpr")
plot(roc.perf, main = "ROC Curve for Converted Prediction Approaches", col = 2, lwd = 2)
accuracy <- mean(test == pred)
precision <- posPredValue(as.factor(pred), as.factor(test), positive = "1")
recall <- sensitivity(as.factor(pred), as.factor(test), positive = "1")
fmeasure <- 2 * precision * recall / (precision + recall)
confmat <- confusionMatrix(pred, test, positive = "1")
kappa <- as.numeric(confmat$overall["Kappa"])
auc <- as.numeric(performance(roc.pred, measure = "auc")@y.values)
## Start the running model-comparison table.
comparisons <- tibble(approach = "Logistic Regression", accuracy = accuracy,
                      fmeasure = fmeasure, kappa = kappa, auc = auc)
#comparisons
set.seed(1234)
## 10-fold CV, choosing the simplest model within one SE of the best.
ctrl <-
  trainControl(method = "cv",
               number = 10,
               selectionFunction = "oneSE")
## Fix: the original first built a C5.0-style grid (.model/.trials/.winnow)
## and then immediately overwrote it — that dead grid is removed. rpart
## tunes only on the complexity parameter cp.
grid <-
  expand.grid(
    .cp = seq(from = 0.0001, to = 0.02, by = 0.0001)
  )
set.seed(1234)
## Tune an rpart classification tree over the cp grid, selected by Kappa.
tree.mod3 <-
train(
Converted ~ .,
data = tcuTrain3,
method = "rpart",
metric = "Kappa",
trControl = ctrl,
tuneGrid = grid,
na.action=na.omit
)
#tree.mod3
library(rpart)
library(rpart.plot)
## Refit the tree at the CV-chosen complexity parameter.
## Fix: cp was hard-coded to 0.0069 (copied from one tuning run); read it
## from tree.mod3$bestTune so a re-run stays consistent with the tuning.
tree_modbest <-
  rpart(
    Converted ~ .,
    method = "class",
    data = tcuTrain3,
    control = rpart.control(cp = tree.mod3$bestTune$cp)
  )
#save(tree_modbest,file = "/Volumes/GoogleDrive/Shared drives/ND Capstone - TCU/04-01-2020/tree_model.RData")
tree.pred <- predict(tree_modbest, tcuTest3, type = "class")
tree.pred.prob <- predict(tree_modbest, tcuTest3, type = "prob")
## Held-out metrics for the classification tree; appended to comparisons.
test <- tcuTest3$Converted
pred <- tree.pred
prob <- tree.pred.prob[, "1"]
roc.pred <- prediction(predictions = prob, labels = test)
roc.perf <- performance(roc.pred, measure = "tpr", x.measure = "fpr")
accuracy <- mean(test == pred)
precision <- posPredValue(as.factor(pred), as.factor(test), positive = "1")
recall <- sensitivity(as.factor(pred), as.factor(test), positive = "1")
fmeasure <- 2 * precision * recall / (precision + recall)
confmat <- confusionMatrix(pred, test, positive = "1")
kappa <- as.numeric(confmat$overall["Kappa"])
auc <- as.numeric(performance(roc.pred, measure = "auc")@y.values)
comparisons <- comparisons %>%
  add_row(approach = "Classification Tree", accuracy = accuracy,
          fmeasure = fmeasure, kappa = kappa, auc = auc)
# From the model summary, the first split is on WelcomeLetter.
#comparisons
## Random forest: 3-fold CV over mtry in {3, 6, 9}, selected by Kappa.
grid <- expand.grid(.mtry = c(3, 6, 9))
ctrl <-
trainControl(method = "cv",
number = 3,
selectionFunction = "best")
set.seed(1234)
rf.mod <-
train(
Converted ~ .,
data = tcuTrain3,
method = "rf",
metric = "Kappa",
trControl = ctrl,
tuneGrid = grid,
na.action=na.omit
)
## Held-out predictions and metrics for the random forest.
rf.pred <- predict(rf.mod, tcuTest3, type = "raw")
rf.pred.prob <- predict(rf.mod, tcuTest3, type = "prob")
test <- tcuTest3$Converted
pred <- rf.pred
prob <- rf.pred.prob[, "1"]
roc.pred <- prediction(predictions = prob, labels = test)
roc.perf <- performance(roc.pred, measure = "tpr", x.measure = "fpr")
accuracy <- mean(test == pred)
precision <- posPredValue(as.factor(pred), as.factor(test), positive = "1")
recall <- sensitivity(as.factor(pred), as.factor(test), positive = "1")
fmeasure <- 2 * precision * recall / (precision + recall)
confmat <- confusionMatrix(pred, test, positive = "1")
kappa <- as.numeric(confmat$overall["Kappa"])
auc <- as.numeric(performance(roc.pred, measure = "auc")@y.values)
comparisons <- comparisons %>%
  add_row(approach = "Random Forest", accuracy = accuracy,
          fmeasure = fmeasure, kappa = kappa, auc = auc)
#comparisons
## XGBoost: 3-fold CV over max_depth / eta / subsample, selected by Kappa.
ctrl <-
trainControl(method = "cv",
number = 3,
selectionFunction = "best")
grid <- expand.grid(
nrounds = 20,
max_depth = c(4, 6, 8),
eta = c(0.1, 0.3, 0.5),
gamma = 0.01,
colsample_bytree = 1,
min_child_weight = 1,
subsample = c(0.5, 1)
)
set.seed(1234)
xgb.mod <-
train(
Converted ~ .,
data = tcuTrain3,
method = "xgbTree",
metric = "Kappa",
trControl = ctrl,
tuneGrid = grid,
na.action=na.omit
)
## Held-out predictions and metrics for XGBoost; print the final table.
xgb.pred <- predict(xgb.mod, tcuTest3, type = "raw")
xgb.pred.prob <- predict(xgb.mod, tcuTest3, type = "prob")
test <- tcuTest3$Converted
pred <- xgb.pred
prob <- xgb.pred.prob[, "1"]
roc.pred <- prediction(predictions = prob, labels = test)
roc.perf <- performance(roc.pred, measure = "tpr", x.measure = "fpr")
accuracy <- mean(test == pred)
precision <- posPredValue(as.factor(pred), as.factor(test), positive = "1")
recall <- sensitivity(as.factor(pred), as.factor(test), positive = "1")
fmeasure <- 2 * precision * recall / (precision + recall)
confmat <- confusionMatrix(pred, test, positive = "1")
kappa <- as.numeric(confmat$overall["Kappa"])
auc <- as.numeric(performance(roc.pred, measure = "auc")@y.values)
comparisons <- comparisons %>%
  add_row(approach = "Extreme Gradient Boosting", accuracy = accuracy,
          fmeasure = fmeasure, kappa = kappa, auc = auc)
comparisons
# A tibble: 4 x 5
approach accuracy fmeasure kappa auc
<chr> <dbl> <dbl> <dbl> <dbl>
1 Logistic Regression 0.633 0.636 0.266 0.686
2 Classification Tree 0.618 0.634 0.236 0.653
3 Random Forest 0.633 0.646 0.265 0.686
4 Extreme Gradient Boosting 0.639 0.654 0.278 0.695
The four models have similar performance. Therefore, we chose the classification tree, as it is easy to interpret and helps us design business rules.
We use the decision-tree model to predict each customer's probability of converting.
## Score every customer with the tuned tree; keep P(Converted = 1).
## Fix: the original computed the identical predictions twice (tree.pred4,
## then dt.pred.proball) and overwrote `tcu` twice — compute once.
## NOTE(review): this reuses the name `tcu`, clobbering the raw input frame
## loaded at the top of the script.
dt.pred.proball <- predict(tree_modbest, tcu_cleaned_8, type = "prob")
prob <- dt.pred.proball[, "1"]
tcu <- data.frame(tcu_cleaned_8, prob)
## Recover BaseXPMembershipID (dropped before modeling) by merging the
## scored frame back onto tcu_cleaned on four shared numeric columns.
## NOTE(review): merging on floating-point key columns is fragile and can
## fan out on duplicate key combinations — carrying an explicit ID through
## the pipeline would be safer; verify row counts after the merge.
final <- merge(tcu,tcu_cleaned,by.x =c("InitialLTV","SumLoanPaymentTotals","DecisionCreditScore","MonthlyIncome"),
by.y = c("InitialLTV","SumLoanPaymentTotals","DecisionCreditScore","MonthlyIncome"),all.y = TRUE
)
final <- final[!is.na(final$prob),]
## Target list: not-yet-converted members with their predicted probability.
tcu_list <- final %>%
filter(Converted.x == 0) %>%
dplyr::select(BaseXPMembershipID,prob)
#write.csv(tcu_list,"tcu_list")
K-means clustering based on converted customers.
# Candidate features for k-means segmentation of converted customers.
tcu_k_means <- tcu_cleaned_7 %>%
  select(DecisionCreditScore, WghtAvgContractRate, AvgDaysToFirstPayment,
         InitialLTV, MonthlyIncome, AgeAtOrigination, OrigBalanceIndirectLoans,
         SumScheduledPmts, SumLoanPaymentTotals, total_expense, AvgTerm,
         TrdBankCard, WelcomeLetter, TrdOil, TrdAuto, TrdOther, TrdOthFin,
         TrdDeptStore, ExtendedWarranties, TrdBank, TrdMortgage, Converted)
# Restrict to converted customers with complete records; WelcomeLetter is
# not used for clustering.
tcu_k_means_converted <- tcu_k_means %>%
  filter(Converted == 1) %>%
  select(-WelcomeLetter) %>%
  na.omit()
# Standardize the features (z-scores) after dropping the label column.
tcu_k_means_converted <- scale(tcu_k_means_converted %>% select(-Converted))
summary(tcu_k_means_converted)
DecisionCreditScore WghtAvgContractRate AvgDaysToFirstPayment
Min. :-3.9690 Min. :-2.0858 Min. :-1.7439
1st Qu.:-0.7649 1st Qu.:-0.6923 1st Qu.:-1.3477
Median :-0.1274 Median :-0.1827 Median : 0.5014
Mean : 0.0000 Mean : 0.0000 Mean : 0.0000
3rd Qu.: 0.8288 3rd Qu.: 0.4660 3rd Qu.: 0.5014
Max. : 2.0198 Max. : 6.9528 Max. : 6.7090
InitialLTV MonthlyIncome AgeAtOrigination
Min. :-0.08868 Min. :-1.0765 Min. :-1.99981
1st Qu.:-0.02054 1st Qu.:-0.6701 1st Qu.:-0.80289
Median :-0.00863 Median :-0.2637 Median : 0.04198
Mean : 0.00000 Mean : 0.0000 Mean : 0.00000
3rd Qu.: 0.00041 3rd Qu.: 0.3459 3rd Qu.: 0.74605
Max. :90.18255 Max. : 6.1577 Max. : 3.28069
OrigBalanceIndirectLoans SumScheduledPmts SumLoanPaymentTotals
Min. :-2.1235 Min. :-2.6509 Min. :-2.1012
1st Qu.:-0.7035 1st Qu.:-0.7123 1st Qu.:-0.7050
Median :-0.1506 Median :-0.1676 Median :-0.1444
Mean : 0.0000 Mean : 0.0000 Mean : 0.0000
3rd Qu.: 0.5645 3rd Qu.: 0.5386 3rd Qu.: 0.5514
Max. : 5.6967 Max. : 5.8294 Max. : 5.9063
total_expense AvgTerm TrdBankCard
Min. :-0.60141 Min. :-5.5382 Min. :-1.1116
1st Qu.:-0.18587 1st Qu.:-0.7541 1st Qu.:-0.6650
Median :-0.09353 Median : 0.4419 Median :-0.2184
Mean : 0.00000 Mean : 0.0000 Mean : 0.0000
3rd Qu.: 0.19272 3rd Qu.: 0.7409 3rd Qu.: 0.4515
Max. :61.00133 Max. : 1.6379 Max. :10.2763
TrdOil TrdAuto TrdOther
Min. :-0.3262 Min. :-0.9910 Min. :-0.7544
1st Qu.:-0.3262 1st Qu.:-0.5665 1st Qu.:-0.7544
Median :-0.3262 Median :-0.1419 Median :-0.1836
Mean : 0.0000 Mean : 0.0000 Mean : 0.0000
3rd Qu.:-0.3262 3rd Qu.: 0.2826 3rd Qu.: 0.3871
Max. :14.5016 Max. : 7.5002 Max. :15.5116
TrdOthFin TrdDeptStore ExtendedWarranties
Min. :-0.7163 Min. :-0.8099 Min. :-1.097
1st Qu.:-0.7163 1st Qu.:-0.8099 1st Qu.:-1.097
Median :-0.4471 Median :-0.3280 Median : 0.898
Mean : 0.0000 Mean : 0.0000 Mean : 0.000
3rd Qu.: 0.3605 3rd Qu.: 0.4751 3rd Qu.: 0.898
Max. : 8.1670 Max. : 6.8998 Max. : 2.893
TrdBank TrdMortgage
Min. :-0.5653 Min. :-0.8990
1st Qu.:-0.5653 1st Qu.:-0.8990
Median :-0.5653 Median :-0.4037
Mean : 0.0000 Mean : 0.0000
3rd Qu.: 0.4304 3rd Qu.: 0.5867
Max. :16.3617 Max. : 8.0154
ncol(tcu_k_means_converted)
[1] 20
We have 20 variables. As a rule of thumb, sqrt(n/2) is a starting point for the number of clusters; with n = 20 this gives sqrt(10) ≈ 3.16, so we start with k = 3.
# Fix the RNG so the k-means result is reproducible, then fit k = 3
# with 25 random starts and report the cluster sizes.
set.seed(1234)
k_3 <- kmeans(tcu_k_means_converted, centers=3, nstart = 25)
k_3$size
[1] 1991 2768 3381
k_3$centers
DecisionCreditScore WghtAvgContractRate AvgDaysToFirstPayment
1 -0.0979945 -0.08322341 0.09794321
2 0.6235013 -0.46268783 -0.09351241
3 -0.4527491 0.42780766 0.01888123
InitialLTV MonthlyIncome AgeAtOrigination
1 -0.006225711 0.52267366 0.03537167
2 -0.018844764 0.06726442 0.47981243
3 0.019094261 -0.36286044 -0.41364857
OrigBalanceIndirectLoans SumScheduledPmts SumLoanPaymentTotals
1 1.3323512 1.2841754 1.3166601
2 -0.5505496 -0.4892673 -0.6108832
3 -0.3338628 -0.3556644 -0.2752279
total_expense AvgTerm TrdBankCard TrdOil TrdAuto
1 0.181580591 0.4814631 0.0794628 -0.01750233 0.4432106
2 0.004420715 -0.5561747 0.6135295 0.28006800 0.1505790
3 -0.110548209 0.1718126 -0.5490861 -0.21898287 -0.3842754
TrdOther TrdOthFin TrdDeptStore ExtendedWarranties TrdBank
1 0.1556281 0.16107022 0.03104751 0.20862369 0.1040744
2 0.1263763 -0.04640037 0.39475870 -0.27033360 0.2800284
3 -0.1951095 -0.05686323 -0.34146929 0.09846603 -0.2905444
TrdMortgage
1 0.1437406
2 0.5668903
3 -0.5487548
library(factoextra)
library(gridExtra)
# Visualize the 3-cluster solution (projected onto the first two PCs).
fviz_cluster(k_3, data = tcu_k_means_converted)
# Fit alternative solutions to see how the choice of k affects the result.
# (Fitted in this order so the random-number stream matches the original run.)
k_4 <- kmeans(tcu_k_means_converted, centers = 4, nstart = 25)
k_5 <- kmeans(tcu_k_means_converted, centers = 5, nstart = 25)
k_6 <- kmeans(tcu_k_means_converted, centers = 6, nstart = 25)
# Draw the four solutions side by side for comparison.
p1 <- fviz_cluster(k_3, geom = "point", data = tcu_k_means_converted) + ggtitle("k = 3")
p2 <- fviz_cluster(k_4, geom = "point", data = tcu_k_means_converted) + ggtitle("k = 4")
p3 <- fviz_cluster(k_5, geom = "point", data = tcu_k_means_converted) + ggtitle("k = 5")
p4 <- fviz_cluster(k_6, geom = "point", data = tcu_k_means_converted) + ggtitle("k = 6")
grid.arrange(p1, p2, p3, p4, nrow = 2)
# Elbow method: total within-cluster sum of squares (WCSS) for k = 1..n.
# FIXES: preallocate the result vector instead of growing it, use `<-`
# instead of `=` for assignment, and iterate with seq_len().  set.seed()
# stays immediately before the loop so the random-number stream (and
# therefore each kmeans() result) is identical to the original run.
n <- 20
wcss <- numeric(n)
set.seed(1234)
for (k in seq_len(n)) {
  wcss[k] <- sum(kmeans(tcu_k_means_converted, k)$withinss)
}
wcss
[1] 162780.00 145691.12 134792.38 129051.95 124750.56 121314.04
[7] 118522.71 114970.77 109303.47 105363.97 107395.73 105200.90
[13] 103700.98 96419.28 100129.89 99783.08 98031.89 96494.71
[19] 95581.65 94746.20
# Visualize WCSS against the number of clusters; look for the "elbow"
# where additional clusters stop paying off.
tibble(k = seq_along(wcss), value = wcss) %>%
  ggplot(mapping = aes(x = k, y = value)) +
  geom_point() +
  geom_line() +
  labs(title = "The Elbow Method", y = "WCSS", x = "Number of Clusters (k)" ) +
  theme_minimal()
# According to the elbow method, we should choose 10 clusters.
# iter.max raised to 30 so the larger-k fit can converge.
k_10 <- kmeans(tcu_k_means_converted, centers = 10, nstart = 25, iter.max = 30)
# Cluster centers in standardized (z-score) units.
k_10$centers
DecisionCreditScore WghtAvgContractRate AvgDaysToFirstPayment
1 -0.59707944 0.31124978 0.178117369
2 -0.27756955 0.07508951 -0.023471386
3 0.33628211 -0.42660130 0.032571571
4 0.67436393 -0.36423994 -0.069363917
5 0.06598318 -0.27272917 0.114950170
6 -0.28196961 0.02423280 0.070295606
7 -1.44405543 2.28782095 -0.005310404
8 -0.22522763 -0.02014959 0.011096255
9 0.84421759 -0.57540761 -0.246093767
10 -1.44147835 1.53169822 0.501401287
InitialLTV MonthlyIncome AgeAtOrigination
1 -0.002900525 -0.03442677 -0.2032114
2 -0.012262867 -0.08069914 0.6093515
3 -0.013714418 0.53822037 0.2519443
4 -0.023989595 -0.01412350 0.3044887
5 -0.007203508 1.08977144 0.1277190
6 -0.006602262 -0.18383600 -0.6084900
7 -0.007987302 -0.45451545 -0.3308641
8 -0.009001840 -0.40167804 -0.7831764
9 -0.014084744 -0.21702592 0.8222129
10 30.046975526 0.60278898 0.2297352
OrigBalanceIndirectLoans SumScheduledPmts SumLoanPaymentTotals
1 0.75394612 0.66772688 0.8325553
2 -0.24551118 -0.24477049 -0.2337145
3 -0.05837050 -0.08093904 -0.1210659
4 -1.05993068 -0.79791938 -1.1086281
5 2.07219818 2.07587422 1.9964984
6 -0.07892526 -0.17239726 -0.0617731
7 -0.62656811 -0.35058122 -0.3800873
8 -0.49454637 -0.58626698 -0.4856928
9 -0.04019152 -0.13571835 -0.1215228
10 0.46531718 0.85797657 0.8009348
total_expense AvgTerm TrdBankCard TrdOil TrdAuto
1 -0.01621926 0.66621442 -0.410394336 -0.203023108 -0.01366398
2 -0.03030366 -0.06067671 0.751604183 3.940916602 0.28677614
3 0.15665460 -0.04765249 1.313113192 0.008754065 0.76079949
4 -0.03357277 -1.50088056 0.104099619 -0.112779395 -0.09261898
5 0.23335347 0.38140072 0.123172159 -0.071895240 0.59690223
6 -0.04817686 0.33929628 -0.130826385 -0.145628003 -0.03062752
7 -0.11559744 -0.19496418 -0.551598333 -0.237147326 -0.20615814
8 -0.11889751 0.19072431 -0.569406057 -0.250354665 -0.49006615
9 -0.07966413 0.13338369 -0.008809614 -0.174689395 -0.24405517
10 36.86208019 0.30897678 -0.292839143 0.123095733 -0.28344055
TrdOther TrdOthFin TrdDeptStore ExtendedWarranties TrdBank
1 -0.2229764 -0.15223557 -0.25474512 0.36024460 -0.23482542
2 0.1304586 0.17608253 0.75875800 0.09481819 0.82640979
3 0.3873875 0.09291122 0.89242092 -0.20263670 0.73919797
4 -0.1770579 -0.23135844 -0.10759834 -0.77347978 0.01137375
5 0.1021508 0.07020825 -0.06530054 0.08436376 0.13052389
6 1.7092130 2.27355955 0.13948428 0.05738641 -0.08969494
7 -0.3032368 -0.21205767 -0.35009262 0.14163722 -0.20166435
8 -0.3242320 -0.28114299 -0.37099803 -0.08079298 -0.29580149
9 -0.2094304 -0.31797884 0.01594123 0.36319081 -0.16374347
10 -0.2787531 0.53995272 -0.80988084 0.23300717 -0.23341143
TrdMortgage
1 -0.40719310
2 0.43560386
3 1.16765429
4 0.25880458
5 0.29779714
6 -0.31238464
7 -0.47436688
8 -0.60300887
9 0.03543928
10 -0.56882237
# Plot the final 10-cluster solution.
fviz_cluster(k_10, geom = "point", data = tcu_k_means_converted) + ggtitle("k = 10")
# Attach cluster labels to the (unscaled) converted customers.
# FIX: the clustering input was built with na.omit(), so rows with missing
# values must be dropped here too -- otherwise k_10$cluster would not line
# up with (or match the length of) these rows.
converted_customers <- tcu_k_means %>%
  filter(Converted == 1) %>%
  select(-WelcomeLetter) %>%
  na.omit()
converted_customers$cluster <- k_10$cluster
# Per-cluster means of the key numeric features, on the original
# (unscaled) measurement scale, for profiling the segments.
converted_customers %>%
  group_by(cluster) %>%
  summarize(across(c(DecisionCreditScore, WghtAvgContractRate,
                     AvgDaysToFirstPayment, InitialLTV, MonthlyIncome,
                     AgeAtOrigination, OrigBalanceIndirectLoans,
                     SumScheduledPmts, SumLoanPaymentTotals,
                     total_expense, AvgTerm),
                   mean)) %>%
  as.data.frame()
cluster DecisionCreditScore WghtAvgContractRate
1 1 694.0023 5.665988
2 2 713.0487 5.156299
3 3 749.6412 4.073533
4 4 769.7946 4.208123
5 5 733.5283 4.405625
6 6 712.7864 5.046538
7 7 643.5130 9.931889
8 8 716.1689 4.950751
9 9 779.9198 3.752374
10 10 643.6667 8.300000
AvgDaysToFirstPayment InitialLTV MonthlyIncome AgeAtOrigination
1 42.55229 100.44109 4564.117 43.51743
2 41.02597 89.47858 4450.263 55.05844
3 41.45030 87.77893 5973.128 49.98211
4 40.67850 75.74757 4614.074 50.72841
5 42.07402 95.40266 7330.231 48.21771
6 41.73592 96.10667 4196.493 37.76117
7 41.16348 94.48490 3530.480 41.70435
8 41.28770 93.29697 3660.488 35.28006
9 39.34041 87.34531 4114.828 58.08176
10 45.00000 35286.31304 6132.000 49.66667
OrigBalanceIndirectLoans SumScheduledPmts SumLoanPaymentTotals
1 28655.49 460.3555 34110.02
2 19501.04 333.7753 22615.48
3 21215.14 356.5017 23829.85
4 12041.43 257.0433 13183.79
5 40729.91 655.6916 46657.49
6 21026.87 343.8148 24469.03
7 16010.78 319.0974 21037.56
8 17220.03 286.4034 19899.12
9 21381.65 348.9028 23824.92
10 26011.82 486.7467 33769.15
total_expense AvgTerm
1 633.7266 74.25097
2 618.4740 66.95779
3 820.9384 67.08847
4 614.9338 52.50768
5 903.9985 71.39332
6 599.1184 70.97087
7 526.1061 65.61043
8 522.5323 69.48019
9 565.0197 68.90487
10 40570.6667 70.66667
# Size of each cluster and its share of all converted customers.
# FIXES: use `<-` instead of `=` for assignment, and count() instead of
# the equivalent group_by() + summarise(n = n()).
cluster_rows <- converted_customers %>%
  count(cluster)
cluster_rows %>%
  mutate(cluster = as.factor(cluster),
         percent = n / sum(n))
# A tibble: 10 x 3
cluster n percent
<fct> <int> <dbl>
1 1 1291 0.159
2 2 308 0.0378
3 3 1006 0.124
4 4 1042 0.128
5 5 689 0.0846
6 6 515 0.0633
7 7 575 0.0706
8 8 1439 0.177
9 9 1272 0.156
10 10 3 0.000369
head(tcu_k_means)
DecisionCreditScore WghtAvgContractRate AvgDaysToFirstPayment
1 762 4.90 45
2 716 5.84 46
3 781 4.40 45
4 790 4.50 45
5 751 3.90 45
6 700 6.99 44
InitialLTV MonthlyIncome AgeAtOrigination OrigBalanceIndirectLoans
1 77.84608 4166 25 15499.60
2 100.40390 4892 76 18726.98
3 116.05114 5416 31 19639.35
4 57.89504 4583 50 11373.69
5 69.14370 3061 23 12112.18
6 83.85382 4128 47 11625.32
SumScheduledPmts SumLoanPaymentTotals total_expense AvgTerm
1 292.37 17542.20 0 60
2 299.29 22446.75 672 75
3 365.85 21951.00 1200 60
4 212.41 12744.60 1145 60
5 222.85 13371.00 648 60
6 198.72 14307.84 912 72
TrdBankCard WelcomeLetter TrdOil TrdAuto TrdOther TrdOthFin
1 5 0 0 1 5 5
2 1 0 0 3 2 21
3 15 0 0 6 1 5
4 5 0 1 1 0 1
5 13 0 2 1 4 6
6 9 0 1 3 1 4
TrdDeptStore ExtendedWarranties TrdBank TrdMortgage Converted
1 3 0 1 1 0
2 0 0 1 5 0
3 4 0 1 2 0
4 2 1 1 8 0
5 1 0 3 4 0
6 6 0 9 5 0
Discussion: The most valuable customer clusters are clusters 3 and 5, for the following reasons.